library(scran)
library(scater)
library(pheatmap)
library(tidyverse)
# BiocManager::install("scRNAseq")
figure.path <- "C:/Users/jessb/OneDrive/MS-CB/Functional Interpretation of High-Throughput Data/Assignments/Assignment 4"
## Load pancreas data from Baron etal. (2016)
## C:\Users\jessb\AppData\Local\ExperimentHub\ExperimentHub\Cache
baron.sce <- scRNAseq::BaronPancreasData(which = 'human')
## Pancreas cells gene markers. Taken from Table S2, from Baron etal, Cell Syst. 2016 Oct 26;3(4):346-360.e4.
cell.markers <- tibble(cell=c("Alpha", "Beta", "Delta", "Gamma","Epsilon", "Ductal",
"Acinar", "Stellate","Vascular", "Macrophage", "CytotoxicT",
"Mast"),
gene=c("GCG", "INS", "SST", "PPY", "GHRL", "KRT19",
"CPA1" ,"PDGFRB", "VWF", "CD163", "CD3D", "TPSAB1"))
## Additional pancreas from Grun etal. (2016)
# grun.sce <- scRNAseq::GrunPancreasData()
Annotate <- function(sce.obj)
{
## use anyone of these packages to annotate the genes for
## ids, genomic location and description
# library(Organism.dplyr)
# library(EnsDb.Hsapiens.v86)
library(AnnotationHub)
ens.GRCh38 <- AnnotationHub()[["AH73881"]]
genenames <- rownames(sce.obj)
geneids <- mapIds(ens.GRCh38,
keys = genenames,
keytype = "GENENAME",
column = "GENEID")
locations <- mapIds(ens.GRCh38,
keys = genenames,
keytype = "GENENAME",
column = "SEQNAME")
descriptions <- mapIds(ens.GRCh38,
keys = genenames,
keytype = "GENENAME",
column = "DESCRIPTION")
##Add gene annotation
rowData(sce.obj)$geneids <- geneids
rowData(sce.obj)$locations <- locations
rowData(sce.obj)$descriptions <- descriptions
return(sce.obj)
}
## Annotate
baron.sce <- Annotate(baron.sce)
## Warning: Unable to map 1280 of 20125 requested IDs.
## Warning: Unable to map 1280 of 20125 requested IDs.
## Warning: Unable to map 1280 of 20125 requested IDs.
FilterNQC <- function(sce.obj, plot.path=NULL, max_mito_frac=0.15)
{
library(ggplot2)
# max_mito_frac <- 0.15
## find mitochondrial genes
is.mito <- which(rowData(sce.obj)$locations=="MT")
## compute cell QC
qc.cell <- perCellQCMetrics(sce.obj, subsets=list(Mito=is.mito))
qc.cell.df <- cbind(as.numeric(qc.cell$sum),
as.numeric(qc.cell$detected),
as.numeric(qc.cell$subsets_Mito_percent),
colData(sce.obj)$donor)
qc.cell.df <- as.data.frame(qc.cell.df, row.names = colnames(sce.obj))
colnames(qc.cell.df) = c('sum', 'detected', 'mito_percent', 'donor')
## compute gene QC
# qc.gene <- apply(assay(sce.obj), 2, function(x) length(which(x>0)))
# qc.cell.df <- cbind(qc.cell.df, qc.gene)
# colnames(qc.cell.df) = c('sum', 'detected', 'mito_percent', 'donor', 'genes')
## violin plot
violin_cell <- ggplot(qc.cell.df, aes(x=donor, y=mito_percent)) +
geom_violin(trim=FALSE) +
labs(y = "% of UMIs mitochondrial", x="Donors")
ggsave("violin_cell.png", plot = violin_cell, path = plot.path)
violin_gene <- ggplot(qc.cell.df, aes(x=donor, y=detected)) +
geom_violin(trim=FALSE) +
labs(y = "Number of genes expressed", x="Donors")
ggsave("violin_gene.png", plot = violin_gene, path = plot.path)
violin_umi <- ggplot(qc.cell.df, aes(x=donor, y=sum)) +
geom_violin(trim=FALSE) +
labs(y = "Number of genes expressed", x="Donors")
ggsave("violin_umi.png", plot = violin_umi, path = plot.path)
## use `quickPerCellQC` function for filtering cells
## Add diagnostic plot
discarded <- quickPerCellQC(qc.cell,
percent_subsets=c("subsets_Mito_percent",
"detected"))
colData(sce.obj) <- cbind(colData(sce.obj), qc.cell, discarded)
## filter by low number of detected genes & plot
print(plotColData(sce.obj, x="donor", y="detected",
colour_by="low_lib_size"))
print(plotColData(sce.obj, x="donor", y="detected",
colour_by="low_n_features"))
print(plotColData(sce.obj, x="donor", y="detected",
colour_by="high_detected"))
print(plotColData(sce.obj, x="donor", y="subsets_Mito_percent",
colour_by="high_subsets_Mito_percent"))
## remove cells with high mt
sce.obj <- sce.obj[, !discarded$high_subsets_Mito_percent]
return(sce.obj)
}
##Filter
baron.sce <- FilterNQC(baron.sce, plot.path = figure.path)




Normalize <- function(sce.obj, sctransform=FALSE, plot.path=NULL)
{
## normalize by deconvolution
set.seed(100)
clust.sce <- quickCluster(sce.obj)
deconv.sf.sce <- calculateSumFactors(sce.obj, cluster=clust.sce)
colData(sce.obj) <- cbind(colData(sce.obj), clust.sce, deconv.sf.sce)
## plot comparison between size factors
# png(file=paste0(plot.path, "/size_factors.png"))
# plot(clust.sce, deconv.sf.sce)
# dev.off()
col_data <- as.data.frame(colData(sce.obj))
size_factor <- ggplot(col_data, aes(x = clust.sce, y = deconv.sf.sce)) +
geom_boxplot() +
labs(x = "Clusters", y = "Deconvolution size factor")
ggsave("size_factor.png", plot = size_factor, path = plot.path)
## transform to log normal counts using deconvolution size factor
sce.obj <- logNormCounts(sce.obj)
## challenge Q: alternative use of sctransform
return(sce.obj)
}
## Normalize
baron.sce <- Normalize(baron.sce, sctransform=FALSE, plot.path= figure.path)
FeatureSelection <-function(sce.obj, plot.path=NULL)
{
## model gene variance
dec <- modelGeneVar(sce.obj)
# dec.df <- as.data.frame(dec)
## plot mean- variadec.df
plot(dec$mean, dec$total, xlab="Mean log-expression", ylab="Variance")
print(curve(metadata(dec)$trend(x), col="blue", add=TRUE))
## select HVG either by number or FDR threshold
hvg <- getTopHVGs(dec, fdr.threshold=0.05)
return(hvg)
}
## Feature selection
baron.hvg <- FeatureSelection(baron.sce, plot.path = figure.path)

## $x
## [1] -5.551115e-17 5.382593e-02 1.076519e-01 1.614778e-01 2.153037e-01
## [6] 2.691296e-01 3.229556e-01 3.767815e-01 4.306074e-01 4.844333e-01
## [11] 5.382593e-01 5.920852e-01 6.459111e-01 6.997371e-01 7.535630e-01
## [16] 8.073889e-01 8.612148e-01 9.150408e-01 9.688667e-01 1.022693e+00
## [21] 1.076519e+00 1.130344e+00 1.184170e+00 1.237996e+00 1.291822e+00
## [26] 1.345648e+00 1.399474e+00 1.453300e+00 1.507126e+00 1.560952e+00
## [31] 1.614778e+00 1.668604e+00 1.722430e+00 1.776256e+00 1.830082e+00
## [36] 1.883907e+00 1.937733e+00 1.991559e+00 2.045385e+00 2.099211e+00
## [41] 2.153037e+00 2.206863e+00 2.260689e+00 2.314515e+00 2.368341e+00
## [46] 2.422167e+00 2.475993e+00 2.529819e+00 2.583645e+00 2.637470e+00
## [51] 2.691296e+00 2.745122e+00 2.798948e+00 2.852774e+00 2.906600e+00
## [56] 2.960426e+00 3.014252e+00 3.068078e+00 3.121904e+00 3.175730e+00
## [61] 3.229556e+00 3.283382e+00 3.337208e+00 3.391033e+00 3.444859e+00
## [66] 3.498685e+00 3.552511e+00 3.606337e+00 3.660163e+00 3.713989e+00
## [71] 3.767815e+00 3.821641e+00 3.875467e+00 3.929293e+00 3.983119e+00
## [76] 4.036945e+00 4.090771e+00 4.144596e+00 4.198422e+00 4.252248e+00
## [81] 4.306074e+00 4.359900e+00 4.413726e+00 4.467552e+00 4.521378e+00
## [86] 4.575204e+00 4.629030e+00 4.682856e+00 4.736682e+00 4.790508e+00
## [91] 4.844333e+00 4.898159e+00 4.951985e+00 5.005811e+00 5.059637e+00
## [96] 5.113463e+00 5.167289e+00 5.221115e+00 5.274941e+00 5.328767e+00
## [101] 5.382593e+00
##
## $y
## [1] NaN 0.0653179 0.1300124 0.1926598 0.2534547 0.3122463 0.3689155
## [8] 0.4233695 0.4755390 0.5253746 0.5728460 0.6179450 0.6606961 0.7012090
## [15] 0.7397540 0.7762576 0.8108621 0.8439199 0.8753240 0.9047061 0.9316127
## [22] 0.9560617 0.9784777 0.9992314 1.0187099 1.0370315 1.0542348 1.0702803
## [29] 1.0852924 1.0991929 1.1112657 1.1209015 1.1277813 1.1323933 1.1354633
## [36] 1.1365731 1.1363638 1.1350638 1.1336649 1.1317839 1.1273946 1.1188406
## [43] 1.1061021 1.0921572 1.0724377 1.0525400 1.0313591 1.0100198 0.9885689
## [50] 0.9675635 0.9475716 0.9273206 0.9067664 0.8861958 0.8659772 0.8470074
## [57] 0.8284294 0.8110407 0.7938844 0.7782651 0.7635266 0.7498072 0.7365415
## [64] 0.7241412 0.7132220 0.7033238 0.6964545 0.6888344 0.6817823 0.6747141
## [71] 0.6664680 0.6585203 0.6505743 0.6426918 0.6367655 0.6321084 0.6274923
## [78] 0.6234382 0.6196217 0.6165142 0.6133865 0.6102411 0.6071731 0.6047017
## [85] 0.6022128 0.5997082 0.5970636 0.5941421 0.5912137 0.5882797 0.5853413
## [92] 0.5824004 0.5794584 0.5765163 0.5735755 0.5706368 0.5677014 0.5647701
## [99] 0.5618439 0.5589236 0.5560100
Cluster <- function(sce.obj, hvg.obj, plot.path=NULL)
{
## remeber to set.seed for reproducible results
## moved up as scater manual suggested set seed for runPCA
set.seed(100)
## run PCA
sce.obj <- runPCA(sce.obj, subset_row=hvg.obj)
attr(reducedDim(sce.obj), "percentVar") <- attr(reducedDim(sce.obj),
"percentVar")/100
## select meaningful number of dimensions and plot
y <- attr(reducedDim(sce.obj), "percentVar")
x <- seq(1, length(y))
print(plot(x, y, main = "Variance Explained by Principal Components",
xlab = "PC", ylab = "Percent Variance Explained"))
## intended to collect PCs that explained 80% of var, but >50
# for (n in x){
# if (sum(y[1:n]) < 0.8){
# next
# } else {
# optimal_n <- n
# print(cat("The number of principal components that explain
# 80% of the variation are", optimal_))
# break
# }
# }
## reduce dim by PCA and plot
print(plotPCA(sce.obj, ncomponents = 7, colour_by = "label"))
## run TSNE and plot
## ran on pre-existing PCA results to speed via low rank approximation
## selected number of PCs to include using elbow method
sce.obj <- runTSNE(sce.obj, perplexity=50, dimred="PCA", n_dimred=7)
print(plotReducedDim(sce.obj, dimred = "TSNE", colour_by = "label"))
## run UMAP and plot
library(uwot)
sce.obj <- runUMAP(sce.obj)
print(plotReducedDim(sce.obj, dimred = "UMAP", colour_by = "label"))
## build shared nearest-neighbor graph and plot
g <- buildSNNGraph(sce.obj, use.dimred="PCA")
clusters <- igraph::cluster_louvain(g)$membership
sce.obj$clusters <- factor(clusters)
print(table(sce.obj$clusters))
print(plotReducedDim(sce.obj, dimred = "TSNE",
colour_by = "clusters", text_by = "clusters"))
## cluster modularity and heatmap
ratio <- clusterModularity(g, clusters, as.ratio=TRUE)
print(pheatmap(log10(ratio+1), cluster_cols=FALSE, cluster_rows=FALSE,
col=rev(heat.colors(100))))
## graph of clusters and plot
## not sure what's supposed to be here - thought I graphed SNN clusters above
return(sce.obj)
}
## cluster
baron.sce <- Cluster(baron.sce, baron.hvg, plot.path = figure.path)

## NULL

## Warning: package 'uwot' was built under R version 3.6.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## The following object is masked from 'package:S4Vectors':
##
## expand


##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 236 933 143 886 627 826 761 86 1133 263 459 809 237 952 11 207


MarkerGenes <- function(sce.obj, plot.path=NULL)
{
## Find stringent markers. Only genes that are unique to each cluster
## are identified. e.g. Insulin will be missed
markers_unique <- findMarkers(sce.obj, sce.obj$clusters, pval.type="all")
## find any markers
markers_any <- findMarkers(sce.obj, sce.obj$clusters, pval.type="any")
## plot insulin marker
INS_fold <- 1
for (n in seq(3, length(markers_any)+3-1)){
INS_fold <- c(INS_fold, markers_any[[1]][grep("^INS$",
rownames(markers_any[[1]])), n])
}
INS <- numeric(length=length(sce.obj$clusters))
for (n in seq(1, length(sce.obj$clusters))){
INS[n] <- INS_fold[sce.obj$clusters[n]]
}
sce.obj$INS <- INS
print(plotReducedDim(sce.obj, dimred = "TSNE",
colour_by = "INS", text_by = "clusters"))
## find and plot markers for a specific cluster
test <- as.data.frame(markers_unique[[1]][2], row.names = rownames(markers_unique))
clust1_heatmap <- as.matrix(markers_unique[[1]]
[which(test<0.05), 2:length(markers_unique)+1])
print(pheatmap(clust1_heatmap, main = "Cluster 1 logFC for DE Markers"))
return(markers_unique)
}
## Marker genes
baron.markers <- MarkerGenes(baron.sce, plot.path = figure.path)


AnnotateClusters <- function(sce.obj, type.markers, plot.path=NULL)
{
library(AUCell)
cells_rankings <- AUCell_buildRankings(assay(sce.obj))
library(GSEABase)
for (n in seq(1, dim(type.markers)[1])){
assign(paste0("gs", n),
GeneSet(type.markers$gene[n], setName=type.markers$cell[n]))
}
## didn't have time to generalize this
# names_gs <- paste("gs", c(1:dim(type.markers)[1]), sep = "")
gsc <- GeneSetCollection(gs1, gs2, gs3, gs4, gs5, gs6,
gs7, gs8, gs9, gs10, gs11, gs12)
## Annotate the clusters using type.markers
cells_AUC <- AUCell_calcAUC(gsc, cells_rankings,
aucMaxRank=nrow(cells_rankings)*0.05)
# par(mfrow=c(4,3))
cells_assignment <- AUCell_exploreThresholds(cells_AUC, plotHist=TRUE,
nCores=1, assign=TRUE)
selectedThresholds <- getThresholdSelected(cells_assignment)
# par(mfrow=c(4,3))
for (geneSetName in names(selectedThresholds)){
nBreaks <- 5 # Number of levels in the color palettes
## Color palette for the cells that do not pass the threshold
colorPal_Neg <- grDevices::colorRampPalette(c("black","blue", "skyblue"))(nBreaks)
## Color palette for the cells that pass the threshold
colorPal_Pos <- grDevices::colorRampPalette(c("pink", "magenta", "red"))(nBreaks)
## Split cells according to their AUC value for the gene set
passThreshold <- getAUC(cells_AUC)[geneSetName,] > selectedThresholds[geneSetName]
if(sum(passThreshold) >0 ){
aucSplit <- split(getAUC(cells_AUC)[geneSetName,], passThreshold)
## Assign cell color
cellColor <- c(setNames(colorPal_Neg[cut(aucSplit[[1]], breaks=nBreaks)],
names(aucSplit[[1]])), setNames(colorPal_Pos[cut(aucSplit[[2]],
breaks=nBreaks)],
names(aucSplit[[2]])))
plot(reducedDim(sce.obj, "TSNE"), main=geneSetName,
sub="Pink/red cells pass the threshold", xlab="", ylab="",
col=cellColor[rownames(reducedDim(sce.obj, "TSNE"))], pch=16)
}
}
}
## Annotate clusters
AnnotateClusters(baron.sce, cell.markers, plot.path = figure.path)

## min 1% 5% 10% 50% 100%
## 767.00 845.36 954.00 1086.00 1847.00 4922.00
























## save SCE object
# saveRDS(baron.sce, file = paste0("../data/BaronHumanSCE_", Sys.Date(), ".Rds"))